Initialize

# Read the raw case-study CSV (read.csv defaults: comma separated, header row)
cs2.data <- read.csv("F:/R For Real/DDS-Case-Study-2/CaseStudy2-data.csv")

# Columns removed before any analysis
exclude_factors <- c("EmployeeCount", "Over18", "StandardHours")
cs2.data <- dplyr::select(cs2.data, -all_of(exclude_factors))

# Split data into sets of different data types
# AttNum is a numeric analog of the response: 0 when Attrition is "No", 1 otherwise
cs2.data <- mutate(cs2.data, AttNum = as.numeric(Attrition != "No"))

# Numeric columns (plus AttNum) used for the pairs plot and correlation matrix
cs2.numeric <- dplyr::select(
  cs2.data,
  Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate,
  NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears,
  TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole,
  YearsSinceLastPromotion, YearsWithCurrManager, AttNum
)
# No apparent NA values; other missing-value encodings still need to be checked
gg_miss_var(cs2.data)

# Number of NA entries in each column, shown as a one-column data frame
na_count <- vapply(cs2.data, function(column) sum(is.na(column)), integer(1))
na_count <- data.frame(na_count)
na_count
##                          na_count
## ID                              0
## Age                             0
## Attrition                       0
## BusinessTravel                  0
## DailyRate                       0
## Department                      0
## DistanceFromHome                0
## Education                       0
## EducationField                  0
## EmployeeNumber                  0
## EnvironmentSatisfaction         0
## Gender                          0
## HourlyRate                      0
## JobInvolvement                  0
## JobLevel                        0
## JobRole                         0
## JobSatisfaction                 0
## MaritalStatus                   0
## MonthlyIncome                   0
## MonthlyRate                     0
## NumCompaniesWorked              0
## OverTime                        0
## PercentSalaryHike               0
## PerformanceRating               0
## RelationshipSatisfaction        0
## StockOptionLevel                0
## TotalWorkingYears               0
## TrainingTimesLastYear           0
## WorkLifeBalance                 0
## YearsAtCompany                  0
## YearsInCurrentRole              0
## YearsSinceLastPromotion         0
## YearsWithCurrManager            0
## AttNum                          0
# Compact structure dump: each column's type and its first few values
str(cs2.data)
## 'data.frame':    870 obs. of  34 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
##  $ AttNum                  : num  0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary: quartiles and mean for numeric columns, level counts for factors
summary(cs2.data)
##        ID             Age        Attrition           BusinessTravel
##  Min.   :  1.0   Min.   :18.00   No :730   Non-Travel       : 94   
##  1st Qu.:218.2   1st Qu.:30.00   Yes:140   Travel_Frequently:158   
##  Median :435.5   Median :35.00             Travel_Rarely    :618   
##  Mean   :435.5   Mean   :36.83                                     
##  3rd Qu.:652.8   3rd Qu.:43.00                                     
##  Max.   :870.0   Max.   :60.00                                     
##                                                                    
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 103.0   Human Resources       : 35   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Research & Development:562   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Sales                 :273   Median : 7.000   Median :3.000  
##  Mean   : 815.2                                Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                                3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                                Max.   :29.000   Max.   :5.000  
##                                                                                
##           EducationField EmployeeNumber   EnvironmentSatisfaction    Gender   
##  Human Resources : 15    Min.   :   1.0   Min.   :1.000           Female:354  
##  Life Sciences   :358    1st Qu.: 477.2   1st Qu.:2.000           Male  :516  
##  Marketing       :100    Median :1039.0   Median :3.000                       
##  Medical         :270    Mean   :1029.8   Mean   :2.701                       
##  Other           : 52    3rd Qu.:1561.5   3rd Qu.:4.000                       
##  Technical Degree: 75    Max.   :2064.0   Max.   :4.000                       
##                                                                               
##    HourlyRate     JobInvolvement     JobLevel    
##  Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 66.00   Median :3.000   Median :2.000  
##  Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##  3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :100.00   Max.   :4.000   Max.   :5.000  
##                                                  
##                       JobRole    JobSatisfaction  MaritalStatus MonthlyIncome  
##  Sales Executive          :200   Min.   :1.000   Divorced:191   Min.   : 1081  
##  Research Scientist       :172   1st Qu.:2.000   Married :410   1st Qu.: 2840  
##  Laboratory Technician    :153   Median :3.000   Single  :269   Median : 4946  
##  Manufacturing Director   : 87   Mean   :2.709                  Mean   : 6390  
##  Healthcare Representative: 76   3rd Qu.:4.000                  3rd Qu.: 8182  
##  Sales Representative     : 53   Max.   :4.000                  Max.   :19999  
##  (Other)                  :129                                                 
##   MonthlyRate    NumCompaniesWorked OverTime  PercentSalaryHike
##  Min.   : 2094   Min.   :0.000      No :618   Min.   :11.0     
##  1st Qu.: 8092   1st Qu.:1.000      Yes:252   1st Qu.:12.0     
##  Median :14074   Median :2.000                Median :14.0     
##  Mean   :14326   Mean   :2.728                Mean   :15.2     
##  3rd Qu.:20456   3rd Qu.:4.000                3rd Qu.:18.0     
##  Max.   :26997   Max.   :9.000                Max.   :25.0     
##                                                                
##  PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
##  Min.   :3.000     Min.   :1.000            Min.   :0.0000   Min.   : 0.00    
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:0.0000   1st Qu.: 6.00    
##  Median :3.000     Median :3.000            Median :1.0000   Median :10.00    
##  Mean   :3.152     Mean   :2.707            Mean   :0.7839   Mean   :11.05    
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:1.0000   3rd Qu.:15.00    
##  Max.   :4.000     Max.   :4.000            Max.   :3.0000   Max.   :40.00    
##                                                                               
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         Median :3.000   Median : 5.000   Median : 3.000    
##  Mean   :2.832         Mean   :2.782   Mean   : 6.962   Mean   : 4.205    
##  3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000   3rd Qu.: 7.000    
##  Max.   :6.000         Max.   :4.000   Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager     AttNum      
##  Min.   : 0.000          Min.   : 0.00        Min.   :0.0000  
##  1st Qu.: 0.000          1st Qu.: 2.00        1st Qu.:0.0000  
##  Median : 1.000          Median : 3.00        Median :0.0000  
##  Mean   : 2.169          Mean   : 4.14        Mean   :0.1609  
##  3rd Qu.: 3.000          3rd Qu.: 7.00        3rd Qu.:0.0000  
##  Max.   :15.000          Max.   :17.00        Max.   :1.0000  
## 
# Take a look at collinearity: scatterplot matrix of every pair of columns in cs2.numeric
pairs(cs2.numeric)

Correlations of numeric variables and Numeric analog of response

# Nothing looks so highly correlated that we should want to get rid of it.

# Pairwise correlations of the numeric columns (including AttNum), rounded to
# three decimals for the cell labels
corr <- round(cor(cs2.numeric), 3)

# Lower-triangle correlation heatmap with hierarchical ordering of variables.
# The title previously misspelled "Continuous".
ggcorrplot(
  corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "square",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlations of Selected Continuous Variables"
)

Age Vs Everything (numeric)

# Adding in the response: every point is coloured by Attrition

# Attrition seems to occur at younger ages regardless of daily rate
ggplot(cs2.data, aes(Age, DailyRate, color = Attrition)) + geom_point()

# Most workers seem to live within 10 miles (follow up: what percentage?);
# distance from home looks unrelated to age
ggplot(cs2.data, aes(Age, DistanceFromHome, color = Attrition)) + geom_point()

# Attrition seems to occur younger no matter the hourly rate
ggplot(cs2.data, aes(Age, HourlyRate, color = Attrition)) + geom_point()

# Possible further investigation
ggplot(cs2.data, aes(Age, MonthlyIncome, color = Attrition)) + geom_point()

# Eh
ggplot(cs2.data, aes(Age, MonthlyRate, color = Attrition)) + geom_point()

# Further investigation
ggplot(cs2.data, aes(Age, NumCompaniesWorked, color = Attrition)) + geom_point()

# Further investigation
ggplot(cs2.data, aes(Age, PercentSalaryHike, color = Attrition)) + geom_point()

# Further investigation
ggplot(cs2.data, aes(Age, TotalWorkingYears, color = Attrition)) + geom_point()

# Eh
ggplot(cs2.data, aes(Age, TrainingTimesLastYear, color = Attrition)) + geom_point()

# Eh
ggplot(cs2.data, aes(Age, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(Age, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(Age, YearsSinceLastPromotion, color = Attrition)) + geom_point()

# The zero line is quite telling: it looks like a majority of attrition
# happens within 1 year with a manager
ggplot(cs2.data, aes(Age, YearsWithCurrManager, color = Attrition)) + geom_point()

# Within each Attrition panel, the share of employees at each number of years
# with the current manager (one-year bins)
ggplot(cs2.data, aes(x = YearsWithCurrManager, fill = Attrition)) +
  geom_histogram(binwidth = 1) +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~Attrition)

Stuff vs Daily Rate

# DailyRate against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(DailyRate, DistanceFromHome, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, HourlyRate, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, MonthlyIncome, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, MonthlyRate, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, NumCompaniesWorked, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, PercentSalaryHike, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, TotalWorkingYears, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DailyRate, YearsWithCurrManager, color = Attrition)) + geom_point()

Stuff vs Distance from Home

# DistanceFromHome against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(DistanceFromHome, HourlyRate, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, MonthlyIncome, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, MonthlyRate, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, NumCompaniesWorked, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, PercentSalaryHike, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, TotalWorkingYears, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(DistanceFromHome, YearsWithCurrManager, color = Attrition)) + geom_point()

# Stuff vs Hourly Rate: HourlyRate against each remaining numeric variable,
# coloured by Attrition

ggplot(cs2.data, aes(HourlyRate, MonthlyIncome, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, MonthlyRate, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, NumCompaniesWorked, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, PercentSalaryHike, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, TotalWorkingYears, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(HourlyRate, YearsWithCurrManager, color = Attrition)) + geom_point()

Stuff vs Monthly Income

# MonthlyIncome against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(MonthlyIncome, MonthlyRate, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, NumCompaniesWorked, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, PercentSalaryHike, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, TotalWorkingYears, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyIncome, YearsWithCurrManager, color = Attrition)) + geom_point()

Monthly Rate

# MonthlyRate against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(MonthlyRate, NumCompaniesWorked, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyRate, PercentSalaryHike, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyRate, TotalWorkingYears, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyRate, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyRate, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyRate, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyRate, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(MonthlyRate, YearsWithCurrManager, color = Attrition)) + geom_point()

NumCompaniesWorked

# NumCompaniesWorked against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(NumCompaniesWorked, PercentSalaryHike, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(NumCompaniesWorked, TotalWorkingYears, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(NumCompaniesWorked, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(NumCompaniesWorked, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(NumCompaniesWorked, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(NumCompaniesWorked, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(NumCompaniesWorked, YearsWithCurrManager, color = Attrition)) + geom_point()

Percent salary hike

# PercentSalaryHike against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(PercentSalaryHike, TotalWorkingYears, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(PercentSalaryHike, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(PercentSalaryHike, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(PercentSalaryHike, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(PercentSalaryHike, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(PercentSalaryHike, YearsWithCurrManager, color = Attrition)) + geom_point()

TotalWorkingYears

# TotalWorkingYears against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(TotalWorkingYears, TrainingTimesLastYear, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(TotalWorkingYears, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(TotalWorkingYears, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(TotalWorkingYears, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(TotalWorkingYears, YearsWithCurrManager, color = Attrition)) + geom_point()

Training Times Last Year

# TrainingTimesLastYear against each remaining numeric variable, coloured by Attrition
ggplot(cs2.data, aes(TrainingTimesLastYear, YearsAtCompany, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(TrainingTimesLastYear, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(TrainingTimesLastYear, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(TrainingTimesLastYear, YearsWithCurrManager, color = Attrition)) + geom_point()

Years at Company

# Years at company is positively correlated with the other time-measuring
# variables, which is no surprise
ggplot(cs2.data, aes(YearsAtCompany, YearsInCurrentRole, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(YearsAtCompany, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(YearsAtCompany, YearsWithCurrManager, color = Attrition)) + geom_point()

Years In Current Role

# YearsInCurrentRole against the two remaining tenure variables, coloured by Attrition
ggplot(cs2.data, aes(YearsInCurrentRole, YearsSinceLastPromotion, color = Attrition)) + geom_point()

ggplot(cs2.data, aes(YearsInCurrentRole, YearsWithCurrManager, color = Attrition)) + geom_point()

Years Since Last Promotion

# Last remaining pair: years since last promotion vs years with current manager
ggplot(cs2.data, aes(YearsSinceLastPromotion, YearsWithCurrManager, color = Attrition)) + geom_point()

The Well Plotted Interesting Stuff

Something interesting 1 - Cumulative Attrition by Years with current manager

# Share of leavers (Attrition == "Yes") at each number of years with their
# current manager.
#
# Bins are one year wide and, by default, centred on whole years, so the first
# bar spans -0.5 to 0.5. The x limits therefore run from -0.5 to 20.5: with
# limits of c(0, 20) the first bar's xmin falls outside the scale, is censored
# to NA, and the 0-year bar is dropped from the plot. The bins and the
# denominator of the percentages are the same as before.
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsWithCurrManager)) +
  geom_histogram(binwidth = 1, fill = "#00BFC4") +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent Attrition") +
  xlab("Years With Current Manager") +
  ggtitle("Distribution of Attrition by Years with Current Manager") +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  scale_x_continuous(
    breaks = sort(c(seq(0, 20, 5), 3)),
    limits = c(-0.5, 20.5),
    expand = c(0, 0)
  )

# Of those who leave 40% do so before the end of the first year, 60% by year 3, and by year 7 90%
# Empirical CDF of years with current manager among leavers; the dashed lines
# mark years 2 and 7
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsWithCurrManager)) +
  stat_ecdf() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_x_continuous(breaks = seq(1, 15, 1)) +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  geom_vline(xintercept = 7, color = "darkred", size = 1.5, linetype = "dashed") +
  ggtitle("Cumulative Proportion of Attrition by Years with Current Manager") +
  ylab("Cumulative Proportion of Attrition") +
  xlab("Years With Current Manager")

# Share of leavers (Attrition == "Yes") at each number of years in their
# current role.
#
# Bins are one year wide and, by default, centred on whole years, so the first
# bar spans -0.5 to 0.5. The x limits therefore run from -0.5 to 20.5: with
# limits of c(0, 20) the first bar's xmin falls outside the scale, is censored
# to NA, and the 0-year bar is dropped from the plot. The bins and the
# denominator of the percentages are the same as before.
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsInCurrentRole)) +
  geom_histogram(binwidth = 1, fill = "#00BFC4") +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent Attrition") +
  xlab("Years in Current Role") +
  ggtitle("Distribution of Attrition by Years in Current Role") +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  scale_x_continuous(
    breaks = sort(c(seq(0, 20, 5), 3)),
    limits = c(-0.5, 20.5),
    expand = c(0, 0)
  )

# Of those who leave 40% do so before the end of the first year, 60% by year 3, and by year 7 90%
# NOTE(review): the figures above are word-for-word the same as the note on the
# years-with-current-manager plot -- confirm they were re-read for this variable.
# Empirical CDF of years in current role among leavers; the dashed lines mark
# years 2 and 7
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsInCurrentRole)) +
  stat_ecdf() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_x_continuous(breaks = seq(1, 15, 1)) +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  geom_vline(xintercept = 7, color = "darkred", size = 1.5, linetype = "dashed") +
  ggtitle("Cumulative Proportion of Attrition by Years In Current Role") +
  ylab("Cumulative Proportion of Attrition") +
  xlab("Years In Current Role")

# Share of leavers (Attrition == "Yes") at each number of years at the company.
#
# Bins are one year wide and, by default, centred on whole years, so the first
# bar spans -0.5 to 0.5 and the bar for year 20 spans 19.5 to 20.5. The x
# limits therefore run from -0.5 to 20.5: with limits of c(0, 20) both edge
# bars reach outside the scale, are censored to NA, and are dropped from the
# plot. The bins and the denominator of the percentages are the same as
# before; leavers with more than 20 years are still excluded by the limits.
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsAtCompany)) +
  geom_histogram(binwidth = 1, fill = "#00BFC4") +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent Attrition") +
  xlab("Years at Company") +
  ggtitle("Distribution of Attrition by Years at Company") +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  scale_x_continuous(
    breaks = sort(c(seq(0, 20, 5), 3)),
    limits = c(-0.5, 20.5),
    expand = c(0, 0)
  )

# Of those who leave 40% do so before the end of the first year, 60% by year 3, and by year 7 90%
# NOTE(review): the figures above are word-for-word the same as the note on the
# years-with-current-manager plot, and the second marker here sits at year 10
# rather than year 7 -- confirm they were re-read for this variable.
# Empirical CDF of years at company among leavers; any leavers with more than
# 20 years are excluded by the x limits before the ECDF is computed
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsAtCompany)) +
  stat_ecdf() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_x_continuous(breaks = seq(1, 20, 1), limits = c(0, 20)) +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  geom_vline(xintercept = 10, color = "darkred", size = 1.5, linetype = "dashed") +
  ggtitle("Cumulative Proportion of Attrition by Years At Company") +
  ylab("Cumulative Proportion of Attrition") +
  xlab("Years At Company")

## Job Fulfillment and Marital Status

# Work-life balance by marital status: no change between the groups
g1 <- ggplot(cs2.data, aes(x = MaritalStatus, y = WorkLifeBalance, fill = MaritalStatus)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  ) +
  scale_fill_manual(values = c("firebrick1", "darkorchid2", "deepskyblue"))

# Monthly income by marital status: singles generally make slightly less than
# the other two groups, likely age dependent
g2 <- ggplot(cs2.data, aes(x = MaritalStatus, y = MonthlyIncome, fill = MaritalStatus)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  ) +
  scale_fill_manual(values = c("firebrick1", "darkorchid2", "deepskyblue"))

# Job satisfaction by marital status: divorced employees report lower
# satisfaction, though their median is unchanged
g3 <- ggplot(cs2.data, aes(x = MaritalStatus, y = JobSatisfaction, fill = MaritalStatus)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  ) +
  scale_fill_manual(values = c("firebrick1", "darkorchid2", "deepskyblue"))

# Left-aligned bold title for the panel
g11 <- ggdraw() +
  draw_label("Marital Status and Job Fulfillment", fontface = 'bold', x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))

# 5 x 2 grid: title in the first cell of the top row, then the three boxplots
# separated by empty spacer cells in the bottom row
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)

## Job Fulfillment and Gender

# Work-life balance by gender
# (author's note: no change between the groups -- TODO confirm, the same note
# is repeated verbatim in every panel)
g1 <- ggplot(cs2.data, aes(x = Gender, y = WorkLifeBalance, fill = Gender)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("pink", "dodgerblue1"))

# Monthly income by gender
# (the previous comment here described marital status, not gender; no
# gender-specific finding has been written up yet)
g2 <- ggplot(cs2.data, aes(x = Gender, y = MonthlyIncome, fill = Gender)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("pink", "dodgerblue1"))

# Job satisfaction by gender
# (the previous comment here described marital status, not gender; no
# gender-specific finding has been written up yet)
g3 <- ggplot(cs2.data, aes(x = Gender, y = JobSatisfaction, fill = Gender)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("pink", "dodgerblue1"))

# Left-aligned bold title for the panel
g11 <- ggdraw() +
  draw_label("Gender and Job Fulfillment", fontface = 'bold', x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))

# 5 x 2 grid: title in the first cell of the top row, then the three boxplots
# separated by empty spacer cells in the bottom row
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)

# Job fulfillment by job level. JobLevel is an integer column, so group = JobLevel
# is what gives one box per level.

# Work-life balance by job level
# (author's note: no change between the groups -- TODO confirm, the same note
# is repeated verbatim in every panel)
g1 <- ggplot(cs2.data, aes(x = JobLevel, y = WorkLifeBalance, fill = JobLevel, group = JobLevel)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# Monthly income by job level
# (the previous comment here described marital status, not job level)
g2 <- ggplot(cs2.data, aes(x = JobLevel, y = MonthlyIncome, fill = JobLevel, group = JobLevel)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# Job satisfaction by job level
# (the previous comment here described marital status, not job level)
g3 <- ggplot(cs2.data, aes(x = JobLevel, y = JobSatisfaction, fill = JobLevel, group = JobLevel)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# Left-aligned bold title for the panel
g11 <- ggdraw() +
  draw_label("Job Level and Job Fulfillment", fontface = 'bold', x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))

# 5 x 2 grid: title in the first cell of the top row, then the three boxplots
# separated by empty spacer cells in the bottom row
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)

# Job fulfillment by business travel category.

# Work-life balance by business travel
# (author's note: no change between the groups -- TODO confirm, the same note
# is repeated verbatim in every panel)
g1 <- ggplot(cs2.data, aes(x = BusinessTravel, y = WorkLifeBalance, fill = BusinessTravel)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# Monthly income by business travel
# (the previous comment here described marital status, not business travel)
g2 <- ggplot(cs2.data, aes(x = BusinessTravel, y = MonthlyIncome, fill = BusinessTravel)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# Job satisfaction by business travel
# (the previous comment here described marital status, not business travel)
g3 <- ggplot(cs2.data, aes(x = BusinessTravel, y = JobSatisfaction, fill = BusinessTravel)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# Left-aligned bold title for the panel
g11 <- ggdraw() +
  draw_label("Business Travel and Job Fulfillment", fontface = 'bold', x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))


# Side check for the marital-status income note: yep, singles are generally younger...
ggplot(cs2.data, aes(x = MaritalStatus, y = Age, fill = MaritalStatus)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# 5 x 2 grid for the business-travel panel built above: title in the first cell
# of the top row, then the three boxplots separated by empty spacer cells
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)

# Here we can see that overtime and education field affect attrition. Those who
# work overtime are much more likely to leave.
# Most severe in marketing - human resources seems to have a lot no matter what
cs2.data %>%
  group_by(OverTime, EducationField, Attrition) %>%
  summarise(count = n()) %>%
  # summarise() drops the Attrition grouping, so sum(count) below is the total
  # for each OverTime x EducationField cell and Perc is the share within it
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = EducationField, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # x, y and group are inherited from the plot-level mapping
  geom_text(aes(label = round(Perc, 3)), position = position_stack(vjust = 0.5)) +
  ggtitle("Education Field by Overtime and Attrition")
## `summarise()` regrouping output by 'OverTime', 'EducationField' (override with `.groups` argument)

# Attrition share within each OverTime x JobLevel cell, one facet per OverTime value
cs2.data %>%
  group_by(OverTime, JobLevel, Attrition) %>%
  summarise(count = n()) %>%
  # summarise() drops the Attrition grouping, so sum(count) below is the total
  # for each OverTime x JobLevel cell and Perc is the share within it
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = JobLevel, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # x, y and group are inherited from the plot-level mapping
  geom_text(aes(label = round(Perc, 3)), position = position_stack(vjust = 0.5)) +
  ggtitle("Job Level by Overtime and Attrition")
## `summarise()` regrouping output by 'OverTime', 'JobLevel' (override with `.groups` argument)

# Attrition share within each OverTime x MaritalStatus cell, one facet per OverTime value
cs2.data %>%
  group_by(OverTime, MaritalStatus, Attrition) %>%
  summarise(count = n()) %>%
  # summarise() drops the Attrition grouping, so sum(count) below is the total
  # for each OverTime x MaritalStatus cell and Perc is the share within it
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = MaritalStatus, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # x, y and group are inherited from the plot-level mapping
  geom_text(aes(label = round(Perc, 3)), position = position_stack(vjust = 0.5)) +
  ggtitle("Marital Status by Overtime and Attrition")
## `summarise()` regrouping output by 'OverTime', 'MaritalStatus' (override with `.groups` argument)

# Attrition share within each OverTime x JobRole cell, one facet per OverTime value
cs2.data %>%
  group_by(OverTime, JobRole, Attrition) %>%
  summarise(count = n()) %>%
  # summarise() drops the Attrition grouping, so sum(count) below is the total
  # for each OverTime x JobRole cell and Perc is the share within it
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = JobRole, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # x, y and group are inherited from the plot-level mapping; smaller text
  # because there are more categories on the axis
  geom_text(
    aes(label = round(Perc, 3)),
    position = position_stack(vjust = 0.5),
    size = 2.5
  ) +
  ggtitle("Job Role by Overtime and Attrition")
## `summarise()` regrouping output by 'OverTime', 'JobRole' (override with `.groups` argument)

# Disabled: EmployeeCount is removed from cs2.data when the data are loaded,
# so this plot cannot be built. Both lines of the statement stay commented;
# a live continuation line would evaluate and print a bare theme object.
#g1 = cs2.data %>% group_by(EmployeeCount) %>% summarise(count = n()) %>% ggplot(aes(x = EmployeeCount, y = count)) + geom_boxplot() + 
#      theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_blank()) + ggtitle("Employee Count is Constant at 870")
#g2 = cs2.data %>% 
      #group_by(Over18) %>% summarise(count = n()) %>% 
      #ggplot(aes(x = Over18, y = count, fill = Over18)) + geom_bar(stat = "identity", width = 0.5) + 
      #theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none", axis.title.y = element_blank()) +
      #geom_text(aes(label = count), vjust = -0.25) + 
      #ggtitle("All Employees are Over 18") + scale_fill_manual(values = c("steelblue"))

#g11 = ggdraw() + draw_label("No Information Variables", fontface = 'bold', x = 0, hjust = 0) + theme(plot.margin = margin(0,0,0,7))

#plot_grid(g11, NULL, g1,g2,cols = 2, rows = 2, rel_heights = c(0.25,2))
# Overall class balance: proportion of rows in each Attrition level, with
# the rounded proportion printed just above each bar.
cs2.data %>%
  group_by(Attrition) %>%
  summarise(count = n()) %>%
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = Attrition, y = Perc, fill = Attrition)) +
  geom_col() +
  geom_text(aes(label = round(Perc, 3)), vjust = -0.25) +
  labs(title = "Attrition Balance in Data set")
## `summarise()` ungrouping output (override with `.groups` argument)

Travel

# Distance from home by attrition, broken out by four categorical variables.

# Job role: x tick labels are rotated and shrunk, and the x-axis title is hidden.
ggplot(cs2.data, aes(x = JobRole, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
    axis.title.x = element_blank()
  ) +
  labs(title = "Job Role by Distance from Home and Attrition")

# Overtime status
ggplot(cs2.data, aes(x = OverTime, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot()

# Marital status
ggplot(cs2.data, aes(x = MaritalStatus, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot()

# Business travel frequency
ggplot(cs2.data, aes(x = BusinessTravel, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot()

# Possibly Important Continuous Variables

# Box plots of continuous variables split by attrition, stored as g1-g7 so
# they can be arranged in a grid afterwards. The x-axis title is blanked on
# every panel; the legend is hidden on g1-g6 and kept on g7.

g1 <- ggplot(cs2.data, aes(x = Attrition, y = YearsWithCurrManager, fill = Attrition)) +
  geom_boxplot() +
  labs(y = "YrsWManager") +
  theme(legend.position = "none", axis.title.x = element_blank())

g2 <- ggplot(cs2.data, aes(x = Attrition, y = YearsAtCompany, fill = Attrition)) +
  geom_boxplot() +
  labs(y = "YrsAtCo.") +
  theme(legend.position = "none", axis.title.x = element_blank())

g3 <- ggplot(cs2.data, aes(x = Attrition, y = YearsInCurrentRole, fill = Attrition)) +
  geom_boxplot() +
  labs(y = "YrsInRole") +
  theme(legend.position = "none", axis.title.x = element_blank())

# Age and MonthlyIncome keep their default y-axis titles
g4 <- ggplot(cs2.data, aes(x = Attrition, y = Age, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank())

g5 <- ggplot(cs2.data, aes(x = Attrition, y = MonthlyIncome, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank())

g6 <- ggplot(cs2.data, aes(x = Attrition, y = TotalWorkingYears, fill = Attrition)) +
  geom_boxplot() +
  labs(y = "WorkingYears") +
  theme(legend.position = "none", axis.title.x = element_blank())

# No legend.position override here, so this panel shows the Attrition legend
g7 <- ggplot(cs2.data, aes(x = Attrition, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot() +
  theme(axis.title.x = element_blank())

# Title strip placed in the first cell of the grid of continuous-variable
# box plots.
g11 <- ggdraw() +
  draw_label("Largest Spread Continuous Variables", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))

# 4 x 3 layout: title row, g1-g6 over two rows, then g7 in the middle cell of
# the last row. NULL leaves a grid cell empty. nrow/ncol are used because
# plot_grid() warns that rows/cols are deprecated.
plot_grid(
  g11, NULL, NULL,
  g1, g2, g3,
  g4, g5, g6,
  NULL, g7, NULL,
  nrow = 4, ncol = 3, rel_heights = c(0.5, 2, 2, 2)
)

# Build a stacked bar chart of the attrition split within each level of `var`.
#
# data:        data frame containing `var` and an Attrition column.
# var:         unquoted column name to put on the x axis.
# x_text_size: optional font size for the x-axis tick labels; NULL (the
#              default) leaves the size to the active theme.
#
# Returns a ggplot object. Perc is each Attrition count divided by the total
# count for its `var` level, so every stacked bar sums to 1. The legend and
# the y-axis title are hidden.
attrition_share_plot <- function(data, var, x_text_size = NULL) {
  data %>%
    group_by({{ var }}, Attrition) %>%
    # "drop_last" keeps the result grouped by `var`, which is what
    # summarise() does by default here, without the regrouping message
    summarise(count = n(), .groups = "drop_last") %>%
    mutate(Perc = count / sum(count)) %>%
    ggplot(aes(x = {{ var }}, y = Perc, fill = Attrition)) +
    geom_col() +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.65, hjust = 1, size = x_text_size),
      legend.position = "none",
      axis.title.y = element_blank()
    )
}

# One panel per categorical variable, stored as g1-g8 for the grid layout.
g1 <- attrition_share_plot(cs2.data, JobInvolvement)
g2 <- attrition_share_plot(cs2.data, JobLevel)
# Job role uses an explicit tick-label size of 8
g3 <- attrition_share_plot(cs2.data, JobRole, x_text_size = 8)
g4 <- attrition_share_plot(cs2.data, MaritalStatus)
g5 <- attrition_share_plot(cs2.data, NumCompaniesWorked)
g6 <- attrition_share_plot(cs2.data, OverTime)
g7 <- attrition_share_plot(cs2.data, StockOptionLevel)
g8 <- attrition_share_plot(cs2.data, WorkLifeBalance)
# Title strip placed in the first cell of the grid of categorical-variable
# bar charts.
g11 <- ggdraw() +
  draw_label("Largest Difference Categorical Variables", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))

# 4 x 4 layout: title row, g1-g4, an empty row (relative height 0.5) between
# the two rows of plots, then g5-g8. NULL leaves a grid cell empty. ncol/nrow
# are used because plot_grid() warns that cols/rows are deprecated.
plot_grid(
  g11, NULL, NULL, NULL,
  g1, g2, g3, g4,
  NULL, NULL, NULL, NULL,
  g5, g6, g7, g8,
  ncol = 4, nrow = 4, rel_heights = c(0.3, 2, 0.5, 2)
)